# --- Notebook environment setup ---------------------------------------------
# IPython magics: re-import project modules automatically when they change.
%load_ext autoreload
%autoreload 2
from pathlib import Path
import os
import pandas as pd
import numpy as np
import sys
# Make the project package (two directories up) importable, e.g. `utils` below.
sys.path.append("../../")
# Repository-relative locations of the datasets and the exported keras models.
DATA_PATH = Path('../../../data/')
MODEL_PATH = Path('../../../models/keras2lwtnn')
# Figure dimensions in inches used for all saved plots below.
width = 6.693
height = 9.466
import matplotlib as mpl
# NOTE(review): the 'pdf' backend selected here is immediately overridden by
# `%matplotlib inline` on the next magic — it has no effect in the notebook.
mpl.use('pdf')
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')
# Render all figure text through LaTeX with a Computer Modern serif font.
plt.rc('font', family='serif', serif='Computer Modern Roman')
plt.rc('text', usetex=True)
# plt.rc('xtick', labelsize=8)
# plt.rc('ytick', labelsize=8)
# plt.rc('axes', labelsize=8)
# plt.rc('figure', titlesize=10)
from sklearn.model_selection import train_test_split
# Fraction of the training sample to load (1 = everything); the fixed
# random_state makes the subsample reproducible.
k = 1
data = pd.read_hdf(DATA_PATH / 'train_data.h5', 'train_set').sample(frac=k, random_state=137)
# Columns present in the HDF file but excluded from the classifier inputs —
# presumably truth/bookkeeping flags and a reference MVA response
# (TODO confirm against the ntuple producer).
unused_features = [
    'seed_nbIT', 'seed_nLayers', 'seed_mva_value', 'seed_nLHCbIDs',
    'is_downstream_reconstructible_not_electron', 'is_true_seed',
    'has_MCParticle_not_electron', 'has_MCParticle'
]
# Binary classification target.
label_names = [
    'is_downstream_reconstructible'
]
data.head()
# Reproducible 70/30 train/test split; labels cast to int32 for the models.
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
x_train = train_set.drop(label_names + unused_features, axis=1)
y_train = train_set[label_names].copy().astype(np.int32)
x_test = test_set.drop(label_names + unused_features, axis=1)
y_test = test_set[label_names].copy().astype(np.int32)
def latex_rename(df):
    r"""Return *df* with every underscore in its column names escaped as ``\_``.

    Figures are rendered with matplotlib's usetex backend (enabled above),
    where a bare ``_`` in a label would be parsed as a TeX subscript command.
    ``DataFrame.rename`` returns a copy, so the input frame is not modified.
    """
    # Use a raw string: "\_" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python, slated to become an error); r"\_"
    # spells backslash-underscore explicitly with the same runtime value.
    mapping = {column: column.replace("_", r"\_") for column in df.columns}
    return df.rename(columns=mapping)
# Histograms of the raw training features (column names LaTeX-escaped for
# the usetex renderer).
latex_rename(x_train).hist(bins=25, figsize=(width, height), layout=(5,2))
# plt.savefig("features_hist.pdf")
# Repeat with the momentum features on a log10 scale, which spreads their
# steeply falling distributions.  NOTE(review): unlike `unused_features`,
# this drop list keeps seed_nbIT / seed_nLayers / seed_nLHCbIDs — presumably
# intentional for the plot, but confirm.
temp_df = train_set.drop(label_names + ['seed_mva_value','is_downstream_reconstructible_not_electron', 'is_true_seed',
    'has_MCParticle_not_electron', 'has_MCParticle'] , axis=1)
temp_df['log10_seed_p'] = np.log10(temp_df['seed_p'])
temp_df['log10_seed_pt'] = np.log10(temp_df['seed_pt'])
del temp_df['seed_pt']
del temp_df['seed_p']
latex_rename(temp_df).hist(bins=100, figsize=(width, height),layout=(5,2))
plt.tight_layout()
# plt.savefig("features_hist.pdf")
from sklearn.ensemble import RandomForestClassifier
# Baseline classifier: depth-limited forest; class_weight='balanced'
# re-weights classes inversely to their frequencies.
forest = RandomForestClassifier(random_state=42, n_jobs=3, max_depth=20, class_weight='balanced')
# ravel() flattens the single-column label frame into the 1-D array sklearn expects.
forest.fit(x_train, y_train.values.ravel())
from sklearn.metrics import roc_auc_score
# AUC is computed from the positive-class probability (column 1 of predict_proba).
print("train roc:{:.5}".format(roc_auc_score(y_train, forest.predict_proba(x_train)[:,1])))
print("train acc: {:.5}".format(forest.score(x_train, y_train)))
print("test roc:{:.5}".format(roc_auc_score(y_test, forest.predict_proba(x_test)[:,1])))
print("test acc: {:.5}".format(forest.score(x_test, y_test)))
# Load the DNN exported for lwtnn: architecture as JSON, weights as HDF5.
architecture_file = (MODEL_PATH / 'lwtnn_v1_13' / 'architecture.json').as_posix()
weights_file = (MODEL_PATH / 'lwtnn_v1_13' / 'weights.h5').as_posix()
with open(architecture_file) as json_file:
    # The whole keras model JSON sits on the first line of the file.
    json_string = json_file.readlines()[0]
from keras.models import model_from_json
model = model_from_json(json_string)
model.summary()
model.load_weights(weights_file)
# Independent evaluation sample, prepared the same way as the training data.
new_data = pd.read_hdf(DATA_PATH / 'test_data.h5', 'test_set')
new_data = new_data.drop(unused_features, axis=1)
x = new_data.drop(label_names, axis=1)
y = new_data[label_names].copy().astype(np.int32)
latex_rename(x).hist(bins=50, figsize=(width, height), layout=(5,2))
# Class-balance plot: weight each entry by 1/N so the bars show fractions.
weights = np.ones_like(y.values)/float(len(y.values))
plt.figure(figsize=(width/1.75,width/2.5))
plt.hist(y.values, weights=weights)
plt.title(latex_rename(y).columns[0])
# plt.savefig("class_balance.pdf")
def nn_data_preprocessing(x):
    """Standardize the raw seed features into the DNN's input layout.

    Every feature is shifted and scaled with hard-coded constants
    (offset/scale pairs frozen from the training-set statistics — they must
    match the lwtnn variable scaling exactly), and three derived features
    are appended: the polar angle of the seed position, arctanh(pt/p), and
    the radial distance.  All transforms use the *raw* input columns.

    Returns a new DataFrame (same index, input untouched) with the columns
    in the exact order the network was trained on.
    """
    cx = x['seed_x'].values
    cy = x['seed_y'].values
    p = x['seed_p'].values
    pt = x['seed_pt'].values
    scaled = pd.DataFrame(index=x.index)
    scaled['seed_chi2PerDoF'] = (np.sqrt(x['seed_chi2PerDoF'].values) - 1.2984056703378601) * 2.3229264617966523
    scaled['seed_p'] = (np.log10(p) - 3.8730802388444614) * 2.2146322148563526
    scaled['seed_pt'] = (np.log10(pt) - 3.0783963469455355) * 4.6109558947469971
    scaled['seed_x'] = (cx - 0.19295878936748806) * 0.0014091856183636067
    scaled['seed_y'] = (cy + 4.7701536104670419) * 0.0019151935939470761
    scaled['seed_tx'] = (x['seed_tx'].values + 0.0024932173949750538) * 3.2600086889926385
    scaled['seed_ty'] = (x['seed_ty'].values + 0.00060514053765673619) * 14.811235511812143
    scaled['seed_angle'] = (np.arctan(cy / cx) + 0.00059458703227136336) * 1.3499114679506281
    scaled['seed_pr'] = (np.arctanh(pt / p) - 0.2373458146766905) * 5.4252485447962835
    scaled['seed_r'] = (np.sqrt(cy ** 2 + cx ** 2) - 712.28415274688871) * 0.0019285089504441804
    return scaled
x_nn = nn_data_preprocessing(x)
# Visual check of the scaled features; the momentum columns are renamed so
# the plot labels show the log10 transform explicitly.
temp_df = x_nn.copy()
temp_df['log10_seed_p'] = temp_df['seed_p']
temp_df['log10_seed_pt'] = temp_df['seed_pt']
del temp_df['seed_pt']
del temp_df['seed_p']
latex_rename(temp_df).hist(bins=100, figsize=(width,height), layout=(5,2))
plt.tight_layout()
# plt.savefig("scaled_features.pdf")
# Column means — expected near zero if the scaling constants match this
# sample's statistics (TODO confirm).
x_nn.mean()
y.hist()
# Smoke test: run the network on a handful of rows.
model.predict(x_nn.values[:5])
from utils import plot_roc_curve, plot_confusion_matrix
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score
# Accuracy comparison; the DNN probability is cut at 0.5 for a hard decision.
print("accuracy - Random Forest")
accuracy_score(y.values, forest.predict(x.values))
print("accuracy - DNN")
accuracy_score(y.values, np.array(model.predict(x_nn.values) > 0.5, dtype=np.int32))
# Keep the DNN scores with the original index so they can be reused below.
predictions_df = pd.DataFrame(model.predict(x_nn.values), columns=y.columns, index=y.index)
# Row-normalized confusion matrices for both classifiers.
plot_confusion_matrix(
    confusion_matrix(y, forest.predict(x)),
    classes=["ghost","true"],
    title='Normalized confusion matrix - RandomForest',
    normalize=True
)
plot_confusion_matrix(
    confusion_matrix(y, np.array(predictions_df.values > 0.5, dtype=np.int32)),
    classes=["ghost","true"],
    title='Normalized confusion matrix - DNN',
    normalize=True
)
# ROC curves of both classifiers on the evaluation sample.
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y, forest.predict_proba(x)[:,1])
fpr_model, tpr_model, thresholds_model = roc_curve(y, predictions_df.values)
plt.figure(figsize=(8,8))
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plot_roc_curve(fpr_model, tpr_model, "DNN")
plt.legend(loc="lower right")
# Commented-out export of (subsampled) DNN ROC points to CSV:
# with open('roc.csv', 'w') as f:
#     print('fpr,tpr', file=f)
#     for i, (x,y1) in enumerate(zip(fpr_model, tpr_model)):
#         if i%100 == 0:
#             print("{},{}".format(x,y1), file=f)
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
def plot_true_positives_and_negatives(
        y_true,
        probabilities,
        normalize=False,
        step=0.1,
        title='True positives and true negatives vs threshold', ):
    """Sweep the decision threshold and plot TP and TN against it.

    For every threshold in [0, 1) with spacing *step*, the scores are cut
    into hard decisions and a confusion matrix is computed; its diagonal
    entries (row-normalized when *normalize* is True) form the two plotted
    curves, rendered as an interactive plotly figure.

    Returns the (true-positive values, true-negative values, thresholds)
    arrays so callers can export or inspect the scan.
    """
    thresholds = np.arange(0.0, 1.0, step)
    tp_curve = np.empty(thresholds.shape)
    tn_curve = np.empty(thresholds.shape)
    for idx, cut in enumerate(thresholds):
        decisions = np.array(probabilities > cut, dtype=int)
        matrix = confusion_matrix(y_true, decisions)
        if normalize:
            # Divide each row by its sum -> per-true-class rates.
            matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
        tp_curve[idx] = matrix[1, 1]
        tn_curve[idx] = matrix[0, 0]
    traces = [
        go.Scatter(x=thresholds, y=tp_curve, name='true positives'),
        go.Scatter(x=thresholds, y=tn_curve, name='true negatives'),
    ]
    figure_layout = go.Layout(
        title=title,
        xaxis=dict(title='threshold'),
    )
    py.iplot(go.Figure(data=traces, layout=figure_layout))
    return tp_curve, tn_curve, thresholds
# Threshold sweep (row-normalized TP/TN rates) for both classifiers.
plot_true_positives_and_negatives(
    y, forest.predict_proba(x)[:,1],
    title='Thresholds - RandomForest',
    step=1e-2,
    normalize=True
)
true_positives_rate, true_negatives_rate, thresholds = plot_true_positives_and_negatives(
    y, predictions_df.values,
    title='Thresholds - DNN',
    step=1e-2,
    normalize=True
)
# Commented-out export of the DNN threshold scan to CSV:
# with open('tpr_fpr.csv', 'w') as f:
#     print('threshold,tpr,fpr', file=f)
#     for x,y1,y2 in zip(thresholds, true_positives_rate, true_negatives_rate):
#         print("{},{},{}".format(x,y1,y2), file=f)
# Headline AUC scores for both classifiers.
print("ROC score Random Forest")
roc_auc_score(y.values, forest.predict_proba(x)[:,1])
print("ROC score DNN")
roc_auc_score(y, predictions_df.values)
# Row-normalized DNN confusion matrix at the default 0.5 cut ...
cm = confusion_matrix(y, predictions_df.values > 0.5)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm
# ... and at a lower 0.245 working point (presumably chosen from the
# threshold scan above — TODO confirm).
cm = confusion_matrix(y, predictions_df.values > 0.245)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm
from sklearn.metrics import log_loss
# Cross-entropy comparison of both classifiers.
log_loss(y, predictions_df.values)
log_loss(y, forest.predict_proba(x.values)[:,1])
# DNN scores on the training sample plus whether the 0.5-cut decision
# matches the label; used below to inspect the wrong decisions.
dnn_pred = pd.DataFrame(model.predict(nn_data_preprocessing(x_train).values), columns=y_train.columns, index=y_train.index)
dnn_pred['decision'] = dnn_pred['is_downstream_reconstructible'] > 0.5
dnn_pred['correct'] = dnn_pred['decision'] == y_train['is_downstream_reconstructible']
dnn_pred.head()
def inv_expit(x):
    """Inverse of the logistic sigmoid (the logit function).

    Maps a probability in (0, 1) back to the real-valued pre-activation,
    so expit(inv_expit(p)) recovers p.  Inputs outside (0, 1) yield
    inf/nan under numpy's usual floating-point rules.
    """
    inverse_odds = 1.0 / x - 1.0
    return -np.log(inverse_odds)
# Distributions of the DNN pre-activation (logit of the output score) split
# by whether the 0.5-cut decision was wrong or correct on the training set.
# `density=True` replaces the `normed=True` keyword, which was deprecated in
# matplotlib 2.1 and removed in 3.1; the plotted normalization is the same.
inv_expit(dnn_pred[dnn_pred['correct'] == False]['is_downstream_reconstructible']).hist(bins=100, figsize=(width*1.5,width), alpha=.75, density=True)
# Extremes of the logit for wrong and correct decisions.
np.max(inv_expit(dnn_pred[dnn_pred['correct'] == False]['is_downstream_reconstructible']))
np.max(inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']))
np.min(inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']))
inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']).hist(bins=100, alpha=0.75, density=True)
# Overlay the wrong- and correct-decision logit distributions on one figure.
inv_expit(dnn_pred[dnn_pred['correct'] == False]['is_downstream_reconstructible']).hist(bins=100, figsize=(width*1.5,width), alpha=.75, density=True)
inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']).hist(bins=100, alpha=0.75, density=True)
plt.title("Wrong decisions")
plt.rc('text', usetex=True)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.rc('axes', labelsize=10)
plt.rc('figure', titlesize=12)
# Raw string: "\h" is an invalid escape sequence in a plain literal; the
# runtime value (backslash-h for the TeX \hat command) is unchanged.
plt.xlabel(r"$\hat{y}$")
plt.tight_layout()
# plt.savefig("wrong.pdf")
# Raw score (not logit) distribution of the correct decisions.
dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible'].hist(bins=100, figsize=(width/2,width/2.5))
plt.title("Correct decisions")
plt.xlabel(r"$\hat{y}$")
plt.tight_layout()
# plt.savefig("correct.pdf")